#install.packages(c("ggplot2", "dplyr", "ggalluvial", "ggcorrplot", "ggridges", "reshape2", "plotly"))
# Load necessary libraries
library(ggplot2)
library(dplyr)
library(tidyr)
# Summarize data by biome1
hmmer_jr <- read.csv("hmmer_jre.csv")
viral_counts <- hmmer_jr %>%
  group_by(biome1) %>%
  summarise(
    Viral = sum(genes_viral, na.rm = TRUE),
    Plasmid = sum(genes_plasmid, na.rm = TRUE),
    Microbial = sum(genes_microbial, na.rm = TRUE),
    Unknown = sum(genes_unknown, na.rm = TRUE)
  ) %>% pivot_longer(cols = -biome1, names_to = "Category", values_to = "Count")
# Stacked bar chart
# Remove 'Unknown' category
viral_counts_filtered <- viral_counts %>%
  filter(Category != "Unknown")

# Stacked bar chart without 'Unknown'
ggplot(viral_counts_filtered, aes(x = biome1, y = Count, fill = Category)) +
  geom_bar(stat = "identity", position = "stack") +
  theme_minimal() +
  labs(
    x = "Biome",
    y = "Contig Count",
    fill = "Category",
    title = "Viral vs. Non-Viral Capsids Across Biomes (Filtered)"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.1)))  # Adjusts y-axis for better visibility

# Load necessary library
library(dplyr)

# Define T-number categories based on contig length
hmmer_jr <- hmmer_jr %>%
  mutate(
    T_number = case_when(
      contig_length <= 5000 ~ "Mini",
      contig_length > 5000 & contig_length <= 10000 ~ "T=1",
      contig_length > 10000 & contig_length <= 12500 ~ "T≈2",
      contig_length > 12500 & contig_length <= 15000 ~ "T=3",
      contig_length > 15000 & contig_length <= 20000 ~ "T=4",
      TRUE ~ "Unknown"  # For any unexpected values
    )
  )

# Check if T_number is assigned properly
table(hmmer_jr$T_number)
## 
## Mini  T=1  T=3  T=4  T≈2 
##    9  105   35  450  113
ggplot(hmmer_jr, aes(x = contig_length, y = genes_viral, color = T_number)) +
  geom_point(alpha = 0.7, size = 3) +
  geom_smooth(method = "lm", se = FALSE, color = "black") + # Trendline
  scale_x_log10() +
  theme_minimal() +
  labs(
    x = "Capsid Size (bp, Log Scale)",
    y = "Number of Viral Genes",
    color = "T-number",
    title = "Capsid Size vs. Viral Gene Content"
  )
## `geom_smooth()` using formula = 'y ~ x'

library(ggplot2)
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
# Count occurrences of each T-number in each biome
t_number_counts <- hmmer_jr %>%
  count(biome1, T_number) %>%
  pivot_wider(names_from = T_number, values_from = n, values_fill = 0)

# Convert to long format for heatmap
t_number_melt <- melt(t_number_counts, id.vars = "biome1")

# Heatmap
ggplot(t_number_melt, aes(x = biome1, y = variable, fill = value)) +
  geom_tile() +
  scale_fill_gradient(low = "gray", high = "orange") +
  theme_minimal() +
  labs(
    x = "Biome",
    y = "T-number",
    fill = "Frequency",
    title = "T-number Distribution Across Biomes"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplot(hmmer_jr, aes(x = biome1, y = contig_length, fill = biome1)) +
  geom_violin(alpha = 0.6) +
  geom_boxplot(width = 0.2, color = "black") + # Add boxplot inside violin
  scale_y_log10() +
  theme_minimal() +
  labs(
    x = "Biome",
    y = "Capsid Length (bp, Log Scale)",
    fill = "Biome",
    title = "Capsid Length Distributions by Biome"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
# Select relevant columns
capsid_data <- hmmer_jr %>%
  select(T_number, genes_viral, genes_plasmid, genes_microbial, genes_unknown)

# Parallel coordinates plot
ggparcoord(capsid_data, columns = 2:5, groupColumn = 1, scale = "globalminmax") +
  theme_minimal() +
  labs(
    x = "Genomic Features",
    y = "Scaled Values",
    title = "T-number vs. Genome Composition"
  )

library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Aggregate counts
sunburst_data <- hmmer_jr %>%
  count(biome1, biome2, T_number)

# Create sunburst plot
plot_ly(sunburst_data, labels = ~biome2, parents = ~biome1, values = ~n, type = 'sunburst')
print(sunburst_data)
##             biome1            biome2 T_number   n
## 1          Aquatic        Freshwater     Mini   8
## 2          Aquatic        Freshwater      T=1   4
## 3          Aquatic        Freshwater      T=3   6
## 4          Aquatic        Freshwater      T=4  66
## 5          Aquatic        Freshwater      T≈2  43
## 6          Aquatic            Marine      T=1  11
## 7          Aquatic            Marine      T=3  10
## 8          Aquatic            Marine      T=4   6
## 9          Aquatic            Marine      T≈2  25
## 10         Aquatic Non_marine saline      T=1  15
## 11         Aquatic Non_marine saline      T=3   3
## 12         Aquatic Non_marine saline      T≈2   3
## 13         Aquatic          Sediment      T≈2   2
## 14      Engineered        Bioreactor      T=4   2
## 15      Engineered       Solid waste      T≈2   3
## 16      Engineered        Wastewater      T=1   2
## 17      Engineered        Wastewater      T=3   2
## 18      Engineered        Wastewater      T=4   5
## 19      Engineered        Wastewater      T≈2   4
## 20 Host_Associated             Human     Mini   1
## 21 Host_Associated             Human      T=1  71
## 22 Host_Associated             Human      T=3   4
## 23 Host_Associated             Human      T=4 356
## 24 Host_Associated             Human      T≈2  30
## 25 Host_Associated           Mammals      T=3   4
## 26 Host_Associated           Mammals      T=4   2
## 27 Host_Associated           Mammals      T≈2   1
## 28 Host_Associated            Plants      T=3   4
## 29 Host_Associated            Plants      T=4   1
## 30     Terrestrial   Deep subsurface      T=4   1
## 31     Terrestrial   Deep subsurface      T≈2   1
## 32     Terrestrial              Soil      T=1   2
## 33     Terrestrial              Soil      T=3   2
## 34     Terrestrial              Soil      T=4   6
## 35     Terrestrial              Soil      T≈2   1
## 36            <NA>              <NA>      T=4   5
ggplot(hmmer_jr, aes(x = contig_length, y = dtr_length, color = T_number)) +
  geom_point(alpha = 0.7, size = 3) +
  geom_smooth(method = "lm", se = FALSE, color = "black") +
  theme_minimal() +
  labs(
    x = "Capsid Length (bp)",
    y = "DTR Length (bp)",
    color = "T-number",
    title = "DTR Length vs. Capsid Type"
  )
## `geom_smooth()` using formula = 'y ~ x'